Step 1. Pre-Processing our data!¶

InĀ [1]:
# Got to load in our dependencies!
import pandas as pd
import plotly.express as px
import plotly.offline as pyo
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
from pandas.api.types import is_numeric_dtype
from great_tables import GT, md, html, system_fonts, style, loc
InĀ [2]:
# First, we can load in our dataframes, starting with the overall look at all of the listings within the city and its general information for 2024
nov_listings: pd.DataFrame = pd.read_csv('./datasets/new_york_listings.csv')

# For historical reasons, let's also load in the same listings dataset but from July 2023
jul_23_listings: pd.DataFrame = pd.read_csv('./datasets/NYC-Airbnb-2023.csv')
C:\Users\ernie\AppData\Local\Temp\ipykernel_84516\4071149682.py:5: DtypeWarning:

Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.

InĀ [3]:
nov_listings.head(3)
Out[3]:
id listing_url scrape_id last_scraped source name description neighborhood_overview picture_url host_id ... review_scores_communication review_scores_location review_scores_value license instant_bookable calculated_host_listings_count calculated_host_listings_count_entire_homes calculated_host_listings_count_private_rooms calculated_host_listings_count_shared_rooms reviews_per_month
0 2595 https://www.airbnb.com/rooms/2595 20241104040953 2024-11-04 city scrape Skylit Midtown Castle Sanctuary Beautiful, spacious skylit studio in the heart... Centrally located in the heart of Manhattan ju... https://a0.muscache.com/pictures/miso/Hosting-... 2845 ... 4.8 4.81 4.40 NaN f 3 3 0 0 0.27
1 6848 https://www.airbnb.com/rooms/6848 20241104040953 2024-11-04 city scrape Only 2 stops to Manhattan studio Comfortable studio apartment with super comfor... NaN https://a0.muscache.com/pictures/e4f031a7-f146... 15991 ... 4.8 4.69 4.58 NaN f 1 1 0 0 1.04
2 6872 https://www.airbnb.com/rooms/6872 20241104040953 2024-11-04 city scrape Uptown Sanctuary w/ Private Bath (Month to Month) This charming distancing-friendly month-to-mon... This sweet Harlem sanctuary is a 10-20 minute ... https://a0.muscache.com/pictures/miso/Hosting-... 16104 ... 5.0 5.00 5.00 NaN f 2 0 2 0 0.03

3 rows Ɨ 75 columns

InĀ [4]:
jul_23_listings.head(3)
Out[4]:
id name host_id host_name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews last_review reviews_per_month calculated_host_listings_count availability_365 number_of_reviews_ltm license
0 2595 Skylit Midtown Castle 2845 Jennifer Manhattan Midtown 40.75356 -73.98559 Entire home/apt 150 30 49 2022-06-21 0.30 3 314 1 NaN
1 5121 BlissArtsSpace! 7356 Garon Brooklyn Bedford-Stuyvesant 40.68535 -73.95512 Private room 60 30 50 2019-12-02 0.30 2 365 0 NaN
2 5203 Cozy Clean Guest Room - Family Apt 7490 MaryEllen Manhattan Upper West Side 40.80380 -73.96751 Private room 75 2 118 2017-07-21 0.72 1 0 0 NaN
InĀ [5]:
# I like to do this because sometimes the columns have whitespace or weird capitalization you don't even realize
nov_listings.columns = np.vectorize(lambda x: x.strip().lower())(nov_listings.columns)

# Our first pre-processing step can just be dropping columns we definitely, 100% don't need for our analysis
print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings.drop(
    columns=['picture_url', 
             'host_url',
             'neighbourhood', #Not really the neighborhood 
             'host_thumbnail_url', 
             'host_picture_url', 
             'host_has_profile_pic', 
             'host_identity_verified',
             'license',
             ],
    inplace = True
)

# Take the dollar sign out of the price column so we can do some analysis with it
nov_listings['price'] = nov_listings["price"].apply(lambda x: float(x.replace('$', '').replace(',','') if isinstance(x, str) else x))

print(f'nov_listings currently has {len(nov_listings.columns)} columns')
nov_listings currently has 75 columns
nov_listings currently has 67 columns
InĀ [6]:
# Can do the same for the 2023 listings, although there is a lot less data
jul_23_listings.columns = np.vectorize(lambda x: x.strip().lower())(jul_23_listings.columns)

print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings.drop(
        columns=['license', 'number_of_reviews_ltm'],
        inplace=True)

print(f'jul_23_listings currently has {len(jul_23_listings.columns)} columns')
jul_23_listings currently has 18 columns
jul_23_listings currently has 16 columns
InĀ [16]:
# Now, let's take a look at NaN values for each dataframe
print(f'The number of NaN values per column in nov_listings: \n{nov_listings.isna().sum().sort_values(ascending=False)[:11]}')

print(f'The number of NaN values per column in jul_23_listings: \n{jul_23_listings.isna().sum().sort_values(ascending=False)}')

# That's helpful, but doesn't really give me a gauge on what percentage of rows this is, so let's calculate that!
nov_nan_percentages: pd.DataFrame = pd.DataFrame((nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).round(2), columns=['Missing Value Percentage']).reset_index(names=['Column Name'])

nov_nan_table: GT = (
    GT(nov_nan_percentages[:10])
    .tab_header(
        title = html("<span style='font-size:20px; font-weight:bold;'>Missing values for the November 2024 Table</span>"),
        subtitle = html("<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"),
    )
    .tab_options(
        table_font_names=system_fonts("industrial")
    )
    .data_color(
        columns=['Missing Value Percentage'],
        palette = 'RdPu',
    )
)

jul_nan_percentages: pd.DataFrame = pd.DataFrame((jul_23_listings.isna().sum().sort_values(ascending=False)[:20] / len(jul_23_listings.index) * 100).round(2), columns=['Missing Value Percentage']).reset_index(names=['Column Name'])

jul_nan_table: GT = (
    GT(jul_nan_percentages[:10])
    .tab_header(
        title = html("<span style='font-size:20px; font-weight:bold;'>Missing values for the July 2023 Table</span>"),
        subtitle = html("<span style='font-size:15px; font-weight:bold;'>Sorted by the Percentage of the Column Missing</span>"),
    )
    .tab_options(
        table_font_names=system_fonts("industrial"),
    )
    .data_color(
        columns=['Missing Value Percentage'],
        palette = 'RdPu',
    )
)

nov_nan_table.show()
jul_nan_table.show()
The number of NaN values per column in nov_listings: 
neighborhood_overview    16974
host_about               16224
host_response_time       15001
host_response_rate       15001
host_acceptance_rate     14983
last_review              11560
first_review             11560
host_location             7999
host_neighbourhood        7503
has_availability          5367
description               1044
dtype: int64
The number of NaN values per column in jul_23_listings: 
last_review                       10304
reviews_per_month                 10304
name                                 12
host_name                             5
neighbourhood_group                   0
neighbourhood                         0
id                                    0
host_id                               0
longitude                             0
latitude                              0
room_type                             0
price                                 0
number_of_reviews                     0
minimum_nights                        0
calculated_host_listings_count        0
availability_365                      0
dtype: int64
Missing values for the November 2024 Table
Sorted by the Percentage of the Column Missing
Column Name Missing Value Percentage
neighborhood_overview 45.21
host_about 43.21
host_response_time 39.95
host_response_rate 39.95
host_acceptance_rate 39.9
last_review 30.79
first_review 30.79
host_location 21.3
host_neighbourhood 19.98
has_availability 14.29
Missing values for the July 2023 Table
Sorted by the Percentage of the Column Missing
Column Name Missing Value Percentage
last_review 24.0
reviews_per_month 24.0
name 0.03
host_name 0.01
neighbourhood_group 0.0
neighbourhood 0.0
id 0.0
host_id 0.0
longitude 0.0
latitude 0.0

This creates an interesting dilemma; we can definitely drop calendar_updated, but what about the columns that have a noticeable proportion of their values missing? We can fill them in based upon the median, that is pretty easy, but I wanted to take a different approach given that I am taking a geography-centered point of view for this project: fill them based upon the median for that column within their borough. I think this can create a more accurate view without getting so specific that we are filling them based upon similar values in their neighborhood (which might have only a handful of values).

InĀ [8]:
# First, let's get rid of that pesky calendar_updated column
nov_listings.drop(columns=['calendar_updated'], inplace=True)

# Let's get all the numerical columns with more than 30% of their values missing
missing_columns = [name for name, val in (nov_listings.isna().sum().sort_values(ascending=False) / len(nov_listings.index) * 100).items() if val > .3 and is_numeric_dtype(nov_listings[name])]

# Create a function that can replace values in a column based for each borough
def fill_na_with_group_means(df: pd.DataFrame, col: str, group_col: str = 'neighbourhood_group_cleansed') -> pd.Series:
    """ Returns a dictionary with the median for the grouped column that can be used to fill NaN values

    Args:
        df (pd.DataFrame): dataframe to utilize
        col (str): column to take the median of 
        group_col (str, optional): column to group by Defaults to 'neighbourhood_group_cleansed'.

    Returns:
        pd.Series: series with the indexes as the grouped_by indexes and the values as the medians of each group for the specified column
    """
    # print(df.groupby(group_col)[col].transform('median'))
    return df[col].fillna(df.groupby(group_col)[col].transform('median'))

# Do it for every missing column
for col in missing_columns:
    nov_listings[col] = fill_na_with_group_means(nov_listings, col)

From here, we would typically do the same for jul_23_listings, but based upon the analysis above, there aren't many important columns for null values at all, so we can leave that.

Step 2. Visualizations¶

InĀ [9]:
# Set plotly to offline mode so we can display these visualizations
pyo.init_notebook_mode()
InĀ [10]:
# For our first visualization, it might be helpful to look at for each borough how the number of listings have changed from 2023 to now
num_nov_of_listings: pd.Series = nov_listings.groupby('neighbourhood_group_cleansed').size()

num_jul_23_listings: pd.Series = jul_23_listings.groupby('neighbourhood_group').size()

total_diff: float = ((len(nov_listings) - len(jul_23_listings)) / len(jul_23_listings)) * -100

listings_change: pd.DataFrame = pd.DataFrame({
    'borough': num_nov_of_listings.index,
    'July 2023': num_jul_23_listings,
    'November 2024': num_nov_of_listings
})

listings_change['percent_change'] = ((listings_change['November 2024'] - listings_change['July 2023']) / listings_change['July 2023']) * 100

listings_change_fig = px.bar(
    data_frame = listings_change, 
    x = 'borough', 
    template='plotly_dark', 
    y = ['July 2023', 'November 2024'],
    barmode='group', 
    labels = {'borough': 'Borough', 'variable': 'Month'}, 
    color_discrete_map={'July 2023': '#a1c9f4', 'November 2024': '#8de5a1'} 
)

# Update layout
listings_change_fig.update_layout(
    yaxis = dict(title=dict(text='Number of Airbnb Listings')),
    font_family = "Raleway, sans-serif",
    title = dict(text=f'<b>How Local Law 18 changed the number of Airbnbs across NYC boroughs</b><br><sup>The number across the city decreased by {total_diff:.2f}% but that decrease varied across boroughs</sup>'),
    legend = dict (
        x = .5,
        y = -.3,
        orientation = 'h',
        yanchor = 'bottom',
        xanchor = 'center'
    )
)

# Add percent change information above each bar
for i, row in listings_change.iterrows():
    listings_change_fig.add_trace(go.Scatter(
        x=[row['borough']],
        y = [row['July 2023']],
        text=[f"{row['percent_change']:.2f}%", f"{row['percent_change']:.2f}%"],
        mode="text",
        showlegend=False,
        textfont = dict(weight=600)
    ))

# Have to do this weird thing where we save the image and then display it within the notebook because plotly graphs mess up my blog lol
listings_change_fig.write_image("../img/listings_change.png", scale=6, engine="kaleido")

Listings Change